# ---- Libraries ----
# NOTE(review): the original loaded rtweet and tidyverse twice, and loaded
# ggplot2/dplyr separately even though library(tidyverse) attaches both.
# Duplicates removed; one attach per package.
# data wrangling + plotting (attaches ggplot2, dplyr, stringr, tidyr, ...)
suppressPackageStartupMessages(library(tidyverse))
options(dplyr.summarise.inform = FALSE)
# twitter data import/export
library(rtweet)
# tweet screenshot embedding
library(tweetrmd)
# text mining / tokenization
library(tidytext)
# stemming
library(SnowballC)
# lemmatization
library(textstem)
# 3D pie chart
library(plotrix)
# radar chart
library(radarchart)
# choropleth maps + country name reference data
library(choroplethr)
library(choroplethrMaps)
# acast() for the comparison-cloud matrix
library(reshape2)
# word clouds
library(wordcloud)
# network graphs for bigrams
library(ggraph)
# date/time handling
library(lubridate, warn.conflicts = FALSE)
# ---- Load and prepare the tweet data ----
# Tweets previously collected with rtweet and saved via write_as_csv()
df <- read_twitter_csv("final_tweets.csv")
# Sources: [3](https://www.rdocumentation.org/packages/countrycode/versions/0.6/topics/countrycode)
data(country.regions) # dataset that contains country names in different versions from choroplethr
countryname <- as.data.frame(country.regions) # lookup table: iso2c code -> map region name
sprintf('dataset now has %s rows and %s columns', nrow(df), ncol(df))
## [1] "dataset now has 49473 rows and 90 columns"
# Convert the date field to a POSIXct datetime
df$created_at <- as_datetime(df$created_at)
# Changed some fields to factors for easier manipulation later
df$status_id <- as.factor(df$status_id)
df$screen_name <- as.factor(df$screen_name)
df$country_code <- as.factor(df$country_code)
df$hashtags <- as.factor(df$hashtags)
df$urls_expanded_url <- as.factor(df$urls_expanded_url)
# Fix up the reply count field: NA means no recorded replies -> 0, stored as integer
df$reply_count[is.na(df$reply_count)] <- 0
df$reply_count <- as.integer(df$reply_count)
# Add a stable document id (doc1, doc2, ...) and move it to the first column
df <- df %>%
  mutate(doc_id = paste0("doc", row_number())) %>%
  select(doc_id, everything())
# Add tweet length; str_count() with its default empty pattern counts characters
df <- df %>%
  mutate(text_len = str_count(text))
# Keep only the columns used in the analysis below
df1 <- df %>%
  select(doc_id, status_id, created_at, screen_name, text, retweet_count, hashtags, text_len, favourites_count, country_code)
str(df1)
## tibble [49,473 × 10] (S3: tbl_df/tbl/data.frame)
## $ doc_id : chr [1:49473] "doc1" "doc2" "doc3" "doc4" ...
## $ status_id : Factor w/ 48891 levels "1194947193721040898",..: 48891 7343 48890 48889 48888 48887 31614 32617 42592 21409 ...
## $ created_at : POSIXct[1:49473], format: "2020-12-10 21:30:25" "2020-12-09 00:17:48" ...
## $ screen_name : Factor w/ 30739 levels "___SLASH_____",..: 4033 4033 26372 27373 29610 12528 12528 12528 12528 12528 ...
## $ text : chr [1:49473] "Canada approved the vaccine \"after a thorough, independent review.\"\n\nthe initial approval from the Canadian"| __truncated__ "in Britain\n\n800,000 doses of #COVID19Vaccine will be dispensed in the coming weeks\n\nup to four million more"| __truncated__ "Vaccine development is only half the hurdle. Learn about the challenges of #COVID19vaccine distribution and ado"| __truncated__ "@pfizer hallo, did your phase 3 trial of the #COVID19Vaccine include subjects with SLE/Lupus, antiphospholipid "| __truncated__ ...
## $ retweet_count : int [1:49473] 0 1 0 0 1 0 0 0 0 0 ...
## $ hashtags : Factor w/ 24856 levels "_Irresponsible_gov COVID19",..: 11040 11040 11239 11142 17163 5338 3233 11387 11387 12625 ...
## $ text_len : int [1:49473] 301 272 278 280 230 161 109 184 221 154 ...
## $ favourites_count: int [1:49473] 58952 58952 62 20121 1779 852 852 852 852 852 ...
## $ country_code : Factor w/ 59 levels "AE","AL","AR",..: NA NA NA NA NA NA NA NA NA NA ...
# Peek at both ends of the table
head(df1)
tail(df1)
# Rows whose text duplicates an earlier row
duplicates <- df1[duplicated(df1$text), ]
sprintf('number of duplicate text values %d', nrow(duplicates))
## [1] "number of duplicate text values 884"
duplicates
# Drop the repeats, keeping the first occurrence of each text
df1 <- df1[!duplicated(df1$text), ]
sprintf('number of unique text values %d', nrow(df1))
## [1] "number of unique text values 48589"
summary(df1$text_len)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.0 146.0 217.0 207.3 275.0 959.0
# Distribution of tweet lengths
boxplot(df1$text_len,
        ylab = "text_len")
# Tweets flagged as outliers by the boxplot (1.5 * IQR) rule
out <- boxplot.stats(df1$text_len)$out
out_ind <- which(df1$text_len %in% out)
outliers <- df1[out_ind, ]
outliers %>%
  select(doc_id, text_len, screen_name, status_id) %>%
  arrange(desc(text_len))
# Inspect the longest outlier tweet
df1 %>%
  filter(doc_id == 'doc1974') %>%
  select(doc_id, text) %>%
  c()
## $doc_id
## [1] "doc1974"
##
## $text
## [1] "@VadodaraAirport @marionste @BMcB77437937 @StaleSonnen @puretruthcouk @sue91282690 @ParkinJim @BreezerGalway @dontbbamboozled @Nemeses667 @MikeTitchard @drjonesaa @DrUmeshPrabhu @Peterbojangles @Sigbertganser @simplelogical @adrianshort @lexilowdon @LesleyStock5 @AsItIs13658173 @KateShemirani @DocBastard @BillieHuman @honeylyttle1 @AliBeckZeck @maxinecisneros @bruceppdl @cjsnowdon @WhatNowDoc @NHSwhistleblowr @MancunianMEDlC @ann_poppy @ActCarers @Jarmann @docgwyn2 @Phoebejoy1611 @jones_celia @DrSandvika @kateheydonorg @EmergMedDr @MOReilly01 @Ms49533001 @normanlamb @SueAllison809 @WingedPsyche @TV_HIEC_Chair @garethicke @gmcuk @drcolinm @tcgannon It seems like many doctors and nurses don't want the #COVID19Vaccine but yet they want us to have it.\n\nFor the past 9 months we've heard nothing but 'protect the NHS' 'We don't want to overload the NHS' etc etc etc. Now all of a sudden NHS staff are backing off from the vaccine. https://t.co/0U0NhsYGQV"
#[4](https://github.com/gadenbuie/tweetrmd)
include_tweet("https://twitter.com/ang__johnson/status/1337015660136833027")
It seems like many doctors and nurses don't want the #COVID19Vaccine but yet they want us to have it.
— Justice For Alice! (@ang__johnson) December 10, 2020
For the past 9 months we've heard nothing but 'protect the NHS' 'We don't want to overload the NHS' etc etc etc. Now all of a sudden NHS staff are backing off from the vaccine. pic.twitter.com/0U0NhsYGQV
# Inspect another long outlier tweet
df1 %>%
  filter(doc_id == 'doc32363') %>%
  select(doc_id, text) %>%
  c()
## $doc_id
## [1] "doc32363"
##
## $text
## [1] "@sweet_iced_T @sarahbeth0404 @sinnndy1 @Maddog4Biden @Strat14050941 @REDGRRRL1 @resistorgirl2 @katibug817 @PukeonTrump @ZACKHAMMER7 @BidenIsMyPOTUS @AMPMTALK @bmcarthur17 @Rubicon1313 @Kelleyrose20 @troykeasling @PaulDereume @savage_purpose @fingerbobs @june_heinz @beinggerric @nihilismo7 @MattInCincy513 @ButtersKatz @Cathereni @PetraMcCarron2 @bidens_girl @BernieBelieve @jolia_pati @GeoffMotchan @sylviacanty @LanceUSA70 @CatherineResist @FairlyNiceLady @AllTransLivesM1 @Tlovett7 @2bears150 @LuluDowney2 @_tokyo_kiwi_ @Bentcat700Tx @KingRezizt @CupcakesForYou7 @PattyCross2160 @Juliethewarrior @Henness87 @SARA2001NOOR @Metsmania1 @ZuzuBriar @SipaSpellBatCat @Just_ReneaR Wednesday Quagmire of #DopeyDon:\n\n-Top Pentagon Official resigns citing Trump putting Nation @ serious risk\n-TX AG sues 4 Swing States in last gasp \n-Holds Vaccine Presser, what a joke\n-Lying,People Dying\n-#COVID19 290K+Dead 15.2M+Infected\n\n#UNMAGA\n#HumptyTrumpty\n#TrumpIsPathetic"
include_tweet("https://twitter.com/CupofJoeintheD2/status/1336801360461881346")
Wednesday Quagmire of #DopeyDon:
— TheCupofJoeintheD (@CupofJoeintheD2) December 9, 2020
-Top Pentagon Official resigns citing Trump putting Nation @ serious risk
-TX AG sues 4 Swing States in last gasp
-Holds Vaccine Presser, what a joke
-Lying,People Dying
-#COVID19 290K+Dead 15.2M+Infected
#UNMAGA #HumptyTrumpty #TrumpIsPathetic
# ---- Tweets by country ----
# Count geocoded tweets per choroplethr region name; the output columns
# (region, value) are the shape country_choropleth() expects below.
countries_data <- df1 %>%
  filter(!is.na(country_code)) %>%          # keep only geocoded tweets
  rename(iso2c = country_code) %>%          # match the join key in countryname
  left_join(countryname, by = "iso2c") %>%  # attach the map region name
  count(region, sort = TRUE) %>%
  rename(value = n) %>%
  select(region, value)
countries_data
# 59 distinct country codes plus NA
length(unique(df1$country_code))
## [1] 60
# Ten most frequent country codes; reorder() ranks the factor levels by count
df1 %>%
  count(country_code, sort = TRUE) %>%
  mutate(country_code = reorder(country_code, n)) %>%
  slice_max(order_by = country_code, n = 10) %>%
  ggplot(aes(x = country_code, y = n)) +
  geom_col(aes(fill = country_code)) +
  geom_text(aes(label = n, hjust = 1), size = 3.5, color = "black") +
  coord_flip() +
  labs(x = "Location",
       y = "Count") +
  ggtitle("Top Locations") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
#[5](https://www.r-bloggers.com/2017/03/advanced-choroplethr-changing-color-scheme-2/)
# Six regions with the most tweets.
# NOTE(review): this data frame was originally named `labs`, which masked
# ggplot2::labs(); later labs() calls still resolved only because R skips
# non-function objects during function lookup. Renamed to remove the footgun.
top_regions <- data.frame(region = tail(countries_data[order(countries_data$value), ])$region)
# Left joining by region with our original dataset
# NOTE(review): nplotdata is not used below; kept to preserve the report's objects.
nplotdata <- countries_data %>% left_join(top_regions, by = "region")
# Visualise Map
country_choropleth(countries_data, num_colors = 1) +
  scale_fill_gradient(high = "#e34a33", low = "#fee8c8", # set color by stats
                      guide = "colorbar", na.value = "white", name = "Counts of Tweets") +
  ggtitle("Tweets by Country") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
# Quick distributional overview of the cleaned tweet table
summary(df1)
## doc_id status_id created_at
## Length:48589 1194947193721040898: 1 Min. :2019-11-14 11:56:26
## Class :character 1194984725464653830: 1 1st Qu.:2020-12-09 16:12:00
## Mode :character 1195336939345502208: 1 Median :2020-12-09 20:45:00
## 1195386957578326016: 1 Mean :2020-12-03 09:16:47
## 1196461952190631938: 1 3rd Qu.:2020-12-10 16:19:31
## 1196751184557674496: 1 Max. :2020-12-10 21:30:25
## (Other) :48583
## screen_name text retweet_count
## coronaviruscare: 1087 Length:48589 Min. : 0.00
## bitcoinconnect : 357 Class :character 1st Qu.: 0.00
## openletterbot : 220 Mode :character Median : 0.00
## GlobalPandemics: 180 Mean : 7.26
## OxfordVacGroup : 171 3rd Qu.: 1.00
## COVIDLive : 149 Max. :49301.00
## (Other) :46425
## hashtags text_len favourites_count
## COVID19 : 8620 Min. : 14.0 Min. : 0
## COVID19Vaccine : 3404 1st Qu.:146.0 1st Qu.: 295
## Covid19 : 749 Median :217.0 Median : 2063
## covid19 : 613 Mean :207.3 Mean : 15654
## COVID19 COVID19Vaccine: 336 3rd Qu.:275.0 3rd Qu.: 9339
## (Other) :33914 Max. :959.0 Max. :876164
## NA's : 953
## country_code
## US : 691
## GB : 340
## CA : 139
## IN : 94
## PK : 64
## (Other): 215
## NA's :47046
# Tweets per weekday, worldwide.
# after_stat(count) is the current spelling of the deprecated ..count..
ggplot(data = df1, aes(x = wday(created_at, label = TRUE))) +
  geom_bar(aes(fill = after_stat(count))) +
  xlab('Day of the week') + ylab('Number of tweets') +
  theme_minimal() +
  scale_fill_gradient(low = 'orange', high = 'blue') +
  ggtitle("Day Of The Week With The Most Tweets in The World") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
# Same view restricted to US-geocoded tweets
df1 %>%
  filter(!is.na(country_code) & country_code == 'US') %>%
  ggplot(aes(x = wday(created_at, label = TRUE))) +
  geom_bar(aes(fill = after_stat(count))) +
  xlab('Day of the week') + ylab('Number of tweets') +
  theme_minimal() +
  scale_fill_gradient(low = 'orange', high = 'blue') +
  ggtitle("Day Of The Week With The Most Tweets in the US") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
# Ten most active accounts by tweet count
df1 %>%
  count(screen_name, sort = TRUE) %>%
  mutate(screen_name = reorder(screen_name, n)) %>%
  slice_max(order_by = screen_name, n = 10) %>%
  ggplot(aes(x = screen_name, y = n)) +
  geom_col(aes(fill = screen_name)) +
  geom_text(aes(label = n, hjust = 1), size = 3.5, color = "black") +
  coord_flip() +
  labs(x = "Users",
       y = "Count") +
  ggtitle("Top Users") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
# Five tweets from the accounts with the most favourites.
# slice_max() is the current replacement for the superseded top_n(); ties kept.
favorite <- df1 %>%
  arrange(desc(favourites_count)) %>%
  select(created_at, screen_name, status_id, text, favourites_count) %>%
  slice_max(order_by = favourites_count, n = 5)
favorite
include_tweet("https://twitter.com/CrankyCyborg/status/1337123816472936449")
@rudyguiIiani gets 1 of only 108 special drugs for #COVID19 & does he feel any shame for jumping ahead of anyone or change his tune on masks?
— Matt is (@CrankyCyborg) December 10, 2020
No, he goes on tv (via phone in) & continues to poopoo wearing masks etc & says hey just get early treatment, no biggie!🖕@rudyguiIiani 🖕 https://t.co/weN9gO8BYN
# Same favourites ranking restricted to US-geocoded tweets
# (slice_max() replaces the superseded top_n(); ties kept)
favorite_us <- df1 %>%
  filter(!is.na(country_code) & country_code == 'US') %>%
  arrange(desc(favourites_count)) %>%
  select(screen_name, status_id, text, favourites_count) %>%
  slice_max(order_by = favourites_count, n = 5)
favorite_us
include_tweet("https://twitter.com/shortwave8669/status/1336686146542301190")
For Dec. 9th #COVID19Vaccine
— Howard Roark (@shortwave8669) December 9, 2020
new cases new deaths
US 215,586 2,534
Canada 6,171 91
Germany 29,263 568
S. Korea 677 4
#TrumpFailedAmerica
# All tweets ordered by retweet count (worldwide)
retweeted <- df1 %>%
  arrange(desc(retweet_count)) %>%
  select(screen_name, created_at, status_id, text, retweet_count)
retweeted
#[6](https://medium.com/@traffordDataLab/exploring-tweets-in-r-54f6011a193d)
# Weekly tweet frequency over the collection period (log-scaled y axis)
date_span <- paste0(format(min(retweeted$created_at), "%d %B %Y"), " to ",
                    format(max(retweeted$created_at), "%d %B %Y"))
ts_plot(retweeted, "weekly") +
  labs(x = NULL, y = NULL,
       title = "Frequency of Weekly Retweets in the World",
       subtitle = date_span) +
  scale_y_log10() +
  theme_minimal()
include_tweet("https://twitter.com/coronaviruscare/status/1243603938676486146")
During this crisis, our grocery store clerks, delivery drivers, transit and utility workers—along with so many others—have been selflessly getting up every day to make sure we have the things we need. And for that, we say thank you.https://t.co/Fq5qIRXt03
— Barack Obama (@BarackObama) March 27, 2020
# US-only retweet ranking
retweeted_us <- df1 %>%
  filter(!is.na(country_code) & country_code == 'US') %>%
  arrange(desc(retweet_count)) %>%
  select(screen_name, created_at, status_id, text, retweet_count)
retweeted_us
# Hourly tweet frequency for US tweets (log-scaled y axis)
ts_plot(retweeted_us, "hourly") +
  labs(x = NULL, y = NULL,
       title = "Frequency of US Retweets in an hour",
       subtitle = paste0(format(min(retweeted_us$created_at), "%d %B %Y"), " to ", format(max(retweeted_us$created_at), "%d %B %Y"))) +
  scale_y_log10() +
  theme_minimal()
# Top #'s
# Seven most frequent hashtag combinations worldwide.
# geom_col() is the documented shorthand for geom_bar(stat = "identity").
df1 %>%
  count(hashtags, sort = TRUE) %>%
  mutate(hashtags = reorder(hashtags, n)) %>%
  slice_max(order_by = hashtags, n = 7) %>%
  ggplot(aes(x = hashtags, y = n)) +
  geom_col(aes(fill = hashtags)) +
  geom_text(aes(label = n), position = position_dodge(width = 0.9), vjust = -0.25) +
  # NOTE(review): the final theme() call overrides legend.position and
  # plot.title set earlier; the separate calls are kept to preserve output.
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none") +
  labs(x = "Hashtags",
       y = "Count ") +
  ggtitle("Top Hashtag in the World") +
  theme(legend.position = 'none', plot.title = element_text(size = 18, face = 'bold'),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 16, face = "bold"))
# Same view restricted to US-geocoded tweets
df1 %>%
  filter(!is.na(country_code) & country_code == 'US') %>%
  count(hashtags, sort = TRUE) %>%
  mutate(hashtags = reorder(hashtags, n)) %>%
  slice_max(order_by = hashtags, n = 7) %>%
  ggplot(aes(x = hashtags, y = n)) +
  geom_col(aes(fill = hashtags)) +
  geom_text(aes(label = n), position = position_dodge(width = 0.9), vjust = -0.25) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none") +
  labs(x = "Hashtags",
       y = "Count ") +
  ggtitle("Top Hashtag in The US") +
  theme(legend.position = 'none', plot.title = element_text(size = 18, face = 'bold'),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 16, face = "bold"))
#[7](https://www.red-gate.com/simple-talk/sql/bi/text-mining-and-sentiment-analysis-with-r/)
# Drop tweets containing URLs, standalone numbers, t.co links or "amp".
# NOTE(review): the original used df[-grep(...), ], which returns ZERO rows
# whenever grep finds no match (x[-integer(0)] selects nothing). !grepl() is
# the safe equivalent and identical whenever there is at least one match.
# NOTE(review): 't.co' has an unescaped dot and 'amp' matches any substring
# (e.g. "campaign"); kept as-is to preserve the reported corpus.
cleandf <- df1[!grepl("http\\S+\\s*", df1$text), ]
cleandf <- cleandf[!grepl("\\b\\d+\\b", cleandf$text), ]
cleandf <- cleandf[!grepl('t.co', cleandf$text), ]
cleandf <- cleandf[!grepl('amp', cleandf$text), ]
# Extra stop words specific to this corpus (twitter noise, profanity, query terms)
my_stop_words <- tibble(
  word = c("t.co", "rt", "amp", "gt", "shit", "damm", "wow", "fuck", "fucker", "covid19vaccine", "positive", "trump", "william", "shakespeare"),
  lexicon = "twitter"
)
all_stop_words <- stop_words %>%
  bind_rows(my_stop_words)
# tidying and remove stop words, then lemmatize each token
tidy_df <- cleandf %>%
  unnest_tokens(word, text) %>%
  anti_join(all_stop_words, by = 'word') %>%
  mutate(word = lemmatize_words(word))
# strip any whitespace left inside tokens after lemmatization
tidy_df$word <- gsub("\\s+", "", tidy_df$word)
tidy_df
# Corpus-wide word frequencies
frequency_global <- tidy_df %>%
  count(word, sort = TRUE)
# get the top 10 words (slice_max replaces the superseded top_n)
frequency_global %>%
  slice_max(order_by = n, n = 10)
#[7](https://www.red-gate.com/simple-talk/sql/bi/text-mining-and-sentiment-analysis-with-r/)
# Ten most frequent words worldwide (frequency_global is already sorted by n).
# NOTE(review): the original mapped x = word in ggplot() and then silently
# overrode it inside geom_col(); the single equivalent mapping is used here.
frequency_global[1:10, ] %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = word)) +
  geom_col() +
  geom_text(aes(label = n, hjust = 1), size = 3.5, color = "black") +
  coord_flip() +
  ggplot2::labs(
    x = "Word",
    y = NULL) +
  ggtitle("Word Freq in the World") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
# Restrict the token table to US-geocoded tweets
tidy_us <- tidy_df[!is.na(tidy_df$country_code) & tidy_df$country_code == "US", ]
frequency_us <- tidy_us %>%
  count(word, sort = TRUE)
# top 10 words
frequency_us %>%
  slice_max(order_by = n, n = 10)
frequency_us[1:10, ] %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = word)) +
  geom_col() +
  geom_text(aes(label = n, hjust = 1), size = 3.5, color = "black") +
  coord_flip() +
  ggplot2::labs(
    x = "Word",
    y = NULL) +
  ggtitle("Word Freq in the US") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
#[8](https://www.tutorialspoint.com/r/r_pie_charts.htm)
# Attach a bing polarity label (positive/negative) to every token
tweets_bing <- tidy_df %>%
  inner_join(get_sentiments("bing"))
# Share of tokens per polarity, as whole percentages
perc <- tweets_bing %>%
  count(sentiment) %>%
  mutate(total = sum(n)) %>%
  mutate(percent = round(n / total, 2) * 100)
# Pie slice labels, e.g. "55% - negative"
label <- paste0(perc$percent[1:2], '%', ' - ', perc$sentiment[1:2])
pie3D(perc$percent, labels = label, labelcex = 1.1, explode = 0.1,
      main = "Worldwide Sentiment")
#[9]https://www.tidytextmining.com/twitter.html#comparing-word-usage
# Ten most frequent words per polarity.
# slice_max() replaces the superseded top_n(); ties are kept in both.
top_words <- tweets_bing %>%
  count(word, sentiment) %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))
# plot the result
ggplot(top_words, aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = n, hjust = 1), size = 3.5, color = "black") +
  facet_wrap(~sentiment, scales = "free") +
  coord_flip() +
  ggtitle("Most Common Positive and Negative words (World)") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
#[10](https://www.tidytextmining.com/twitter.html#comparing-word-usage)
# Comparison cloud: negative vs positive vocabulary, sized by frequency
polarity_counts <- tidy_df %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)
polarity_counts %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("#1b2a49", "#00909e"),
                   max.words = 100)
# Ten most frequent words per polarity, US tokens only.
# slice_max() replaces the superseded top_n(); ties are kept in both.
top_words_us <- tidy_us %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment) %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))
# plot the result above
ggplot(top_words_us, aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = n, hjust = 1), size = 3.5, color = "black") +
  facet_wrap(~sentiment, scales = "free") +
  coord_flip() +
  ggtitle("Most common positive and negative words (US)") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
# Sentiment ranking list: share of NRC emotion categories across all tokens
# (the two polarity categories are dropped, leaving the eight emotions)
nrc_words <- tidy_df %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(sentiment, sort = TRUE) %>%
  mutate(percent = round(100 * n / sum(n))) %>%
  select(sentiment, percent)
nrc_words
# NOTE(review): the duplicated/overridden aes() between ggplot() and
# geom_col() in the original is collapsed into one equivalent mapping.
nrc_words %>%
  ggplot(aes(x = reorder(sentiment, percent), y = percent, fill = sentiment)) +
  geom_col() +
  geom_text(aes(label = percent, hjust = 1), size = 3.5, color = "black") +
  coord_flip() +
  ggplot2::labs(
    x = "Sentiment",
    y = " Percentage % ") +
  ggtitle("Emotions of Tweets in the World") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
# Five most frequent words per emotion
# (by = "word" added for consistency with the join above;
#  slice_max replaces the superseded top_n)
tidy_df %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(word, sentiment) %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 5) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free") +
  coord_flip() +
  ggtitle(label = "Sentiment Word Frequency (World)") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
# Sentiment ranking list: NRC emotion shares for US-geocoded tokens only
nrc_words_us <- tidy_us %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(sentiment, sort = TRUE) %>%
  mutate(percent = round(100 * n / sum(n))) %>%
  select(sentiment, percent)
nrc_words_us
# Single equivalent aes mapping (the original overrode x inside geom_col)
nrc_words_us %>%
  ggplot(aes(x = reorder(sentiment, percent), y = percent, fill = sentiment)) +
  geom_col() +
  geom_text(aes(label = percent, hjust = 1), size = 3.5, color = "black") +
  coord_flip() +
  ggplot2::labs(
    x = "Sentiment",
    y = " Percentage % ") +
  ggtitle("Emotions of Tweets in the US") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
# Five most frequent words per emotion, US only
# (by = "word" added for consistency; slice_max replaces superseded top_n)
tidy_us %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(word, sentiment) %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 5) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free") +
  coord_flip() +
  ggtitle(label = "Sentiment Word Frequency (US)") +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
# Bigrams over the cleaned tokens. unnest_tokens() with token = "ngrams"
# collapses rows sharing the same metadata (i.e. all kept words of one tweet)
# back into a single text before tokenizing, so this yields pairs of
# consecutive *kept* words (stop words removed, lemmatized) within each tweet;
# tweets reduced to a single word produce an NA bigram.
# NOTE(review): this relies on tidytext's `collapse` default for n-gram
# tokenization, whose behaviour has changed across tidytext versions —
# verify against the installed version before re-running.
bigrams_1 <- tidy_df %>%
unnest_tokens(bigram, word, token = "ngrams", n=2)
bigrams_1
# Split each bigram into its two component words
bigrams_separated <- bigrams_1 %>%
separate(bigram, c("word1", "word2"), sep = " ")
# Drop stop-word pairs and the NA rows from one-word tweets,
# then keep the 15 most frequent pairs
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!(is.na(word1) | is.na(word2))) %>%
count(word1, word2, sort = TRUE) %>%
head(15)
bigrams_filtered
#[11](https://bookdown.org/Maxine/tidy-text-mining/tokenizing-by-n-gram.html)
# Directed network of the top-15 bigrams; edge transparency scales with count.
# ggraph() coerces the (word1, word2, n) edge list to a graph, which creates
# the `name` node attribute used for the labels.
arrow <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigrams_filtered, layout = "fr") +
geom_edge_link(aes(alpha = n), show.legend = FALSE,
arrow = arrow, end_cap = circle(0.07, "inches")) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
ggtitle("Top 15 Bigrams") +
theme(plot.title = element_text(size = 14, face = "bold",hjust = 0.5))